`summarise()` has grouped output by 'date'. You can override using the
`.groups` argument.
# Plot each measure over time, colored by organization type, one facet per
# measure. Expects `tmp` (monthly aggregates) from the preceding chunk.
# Earlier variants kept for reference:
#   |> mutate(date = ifelse(seq_along(date) %% 2 == 1, date, ""))
#   pivot_longer(cols = 3:6, names_to = "measure", values_to = "score")
tmp |>
  # Wide -> long: columns 3:7 hold the per-month measure scores.
  pivot_longer(cols = 3:7, names_to = "measure", values_to = "score") |>
  ggplot(aes(x = date, y = score, group = org_type,
             color = as.factor(org_type))) +
  geom_point(size = 1.5) +
  # Linear trend per organization type, no confidence band.
  geom_smooth(method = lm, linewidth = 1.5, se = FALSE) +
  # scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
  # scale_x_discrete(labels = ifelse(seq_along(tmp$date) %% 2 == 1, tmp$date, "")) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  facet_wrap(vars(measure), scales = "free") +
  labs(
    title = "measures by time",
    subtitle = "sentiment measures and engagement",
    # Fixed typo in the user-facing caption: "engagment" -> "engagement".
    caption = "sentiment is increasing over time, engagement is decreasing",
    x = "date of post",
    y = "score",
    color = "organization type"
  ) +
  theme(
    # legend.position = "top",          # Reposition legend to top
    # legend.key.size = unit(1, "cm"),  # Change size of legend keys
    legend.text = element_text(size = 10, face = "bold")
  )
`geom_smooth()` using formula = 'y ~ x'
sentiment mått vs engagement index
sentiment extremer + exempel poster
# Aggregate sentiment and (follower-normalized) engagement per month, then
# scatter each sentiment measure against the engagement index.
# NOTE(review): columns are selected by position (19, 22, 27, ...) in dtp;
# verify these indices still point at the intended columns if dtp changes.
tmp <- dtp |>
  select(c(19, 22, 27, 14, 23, 25, 11, 6)) |>
  mutate(sa_int_rel_wc = sa_int_abs / wc) |>
  mutate(m0_e_index_rel_follow = e_index / follow) |>
  group_by(month) |>
  summarize(
    m1_sa_val = mean(sa_val),
    m2_sa_int_abs = mean(sa_int_abs),
    m3_wc = mean(wc),
    m4_sa_int_rel_wc = mean(sa_int_rel_wc),
    # date = first(date)
    m0_e_index = mean(m0_e_index_rel_follow)
  )

# Long format over the four sentiment measures (columns 2:5); one facet each.
tmp |>
  pivot_longer(cols = 2:5, names_to = "measure", values_to = "score") |>
  ggplot(aes(x = score, y = m0_e_index)) +
  geom_point(size = 1.5) +
  geom_smooth(method = lm, linewidth = 1.5, se = FALSE) +
  facet_wrap(~measure, scales = "free", nrow = 2, ncol = 2) +
  labs(
    title = "engagement by sentiment",
    subtitle = "association between engagement and sentiment",
    caption = "flat or negative trends",
    x = "sentiment",
    y = "engagement"
    # color = "organization type"
  ) +
  theme(
    # legend.position = "top",          # Reposition legend to top
    # legend.key.size = unit(1, "cm"),  # Change size of legend keys
    legend.text = element_text(size = 10, face = "bold")
  )
`geom_smooth()` using formula = 'y ~ x'
240104: sentiment, top and bottom
# Read the sentence-tokenized sentiment file and print the 10 most positive
# and 10 most negative Swedish sentences as a markdown table.
fp <- "../tmp/fb-sa-231228-010.tsv"
tmp <- read.table(fp, sep = '\t', quote = "", comment.char = "",
                  header = TRUE, strip.white = TRUE, stringsAsFactors = FALSE)

# Join language info once and keep Swedish sentences only. The original
# pipeline recomputed this identical join+filter for both the top and the
# bottom extraction; doing it once avoids the duplication.
sv_df <- tmp |>
  left_join(dtp |> select(id, lang), by = "id") |>
  filter(lang == "sv")

# 10 highest-sentiment sentences (ties broken by score, descending).
top_df <- sv_df |>
  arrange(desc(sa_numeric), desc(sa_score)) |>
  mutate(category = "top") |>
  head(10)

# 10 lowest-sentiment sentences.
bottom_df <- sv_df |>
  arrange(sa_numeric, desc(sa_score)) |>
  mutate(category = "bottom") |>
  head(10)

result_df <- bind_rows(top_df, bottom_df) |>
  select(id, sentence, sa_label)

# install.packages('simplermarkdown', lib="~/lib/r-cran")
# library(simplermarkdown)
cat(simplermarkdown::md_table(result_df))
id
sentence
sa_label
rfsl.forbundet-10157198447557301
Happy pride!
positive
Nordensark-3258235214234799
Thank you all for that!
positive
rfsl.forbundet-10157211573442301
Tack grattis och happy pride!
positive
kalmarstadsmission-3405318379530319
Det tycker vi e bra!🌏 Var med och bidra till hållbarhet - handla på Kalmar Stadsmission Second Hand - öppet imorgon lördag 10-15!
positive
kfumsverige-3915350888535051
Så inspirerande och så viktigt!
positive
Lakarmissionen-3129368217161452
Årets viktigaste insamling!
positive
uppsalastadsmission-3176615612383966
Handla för andra och stötta en god sak i sommar!
positive
raddabarnen-10157641596116794
Årets viktigaste böcker är här!
positive
kalmarstadsmission-3193348157394010
Second hand till bästa pris!!!
positive
RiksforbundetHjartLung-3554658201222473
Det har gått bra och varit värdefullt.
positive
raddningsmissionen-10158140807240345
Fattigdom är ett stort problem världen över cirka 10% av världens befolkning lever i så kallad extrem fattigdom.
negative
HRWSweden-1017159588800525
Det kritiska uttalandet leddes av Tysklands FN-ambassadör som bland annat tog upp Kinas polisbrutalitet mot demokratiprotester i Hong Kong och Tibet samt storskalig diskriminering av Kinas muslimska minoritet i provinsen Xinjiang.
negative
eskilstunastadsmission-1762474967239976
Kanske tristess.
negative
raddabarnen-10157498109326794
Situationen är desperat!
negative
AnhorigasRiksforbund-1709548629184479
Där är den mest kritiska posten enligt mig den som visar att 51 procent av de som svarade i undersökningen Inte hade blivit erbjuden stöd och inte heller vetat vart de skulle vända sig.
negative
AnhorigasRiksforbund-1769080763231265
Vi vill att användarna är kritiska till saker de tycker inte fungerar är otydligt formulerat saknas eller på andra sätt kan bli bättre!
negative
rodakorset-3460295044001098
Stor explosion i Beirut – många döda och skadade.
negative
AmnestySverige-10158372520624788
🇺🇸 Det dödliga polisvåldet och misslyckandet i att stoppa den systematiska rasismen i USA kränker svarta amerikaners mänskliga rättigheter inklusive rätten till liv och rätten att inte diskrimineras.
negative
AmnestySverige-10158559101704788
Läget är kritiskt och nu KRÄVER VI KRAFTTAG FRÅN UD!
negative
rfsu.se-3699723843390967
- Antitrans-allianser och den könsbekräftande vården - Corona - a crisis exploited by populists - Fittfakta deluxe!
# Inferential analysis: linear mixed model of summed sentence sentiment with
# fixed effects for time, follower count, word count, and language, and a
# random intercept per organization type.
m1 <- lmer(
  sa_numeric_sum ~ time + follow + wc + lang + (1 | org_type),
  data = dts
)
Warning: Some predictor variables are on very different scales: consider
rescaling
#m1 = lm(sa_numeric_sum ~ time + follow + wc + lang, data=dts)#m1 = lmer(sa_val ~ time + follow + wc + lang + (1|org_type), data=dts)#summary(m1)
Linear mixed model fit by REML ['lmerMod']
Formula: sa_numeric_sum ~ time + follow + wc + lang + (1 | org_type)
Data: dts
REML criterion at convergence: 452753.9
Scaled residuals:
Min 1Q Median 3Q Max
-16.5404 -0.5866 0.0118 0.5874 22.5713
Random effects:
Groups Name Variance Std.Dev.
org_type (Intercept) 0.3036 0.551
Residual 6.1671 2.483
Number of obs: 97188, groups: org_type, 8
Fixed effects:
Estimate Std. Error t value
(Intercept) 2.255e+00 3.381e-01 6.671
time -5.278e-10 1.793e-10 -2.943
follow -3.457e-06 1.239e-07 -27.909
wc -4.003e-04 1.632e-04 -2.452
langsv -1.243e-01 3.271e-02 -3.801
Correlation of Fixed Effects:
(Intr) time follow wc
time -0.812
follow 0.009 -0.022
wc 0.110 -0.147 0.009
langsv -0.121 0.046 -0.035 -0.207
fit warnings:
Some predictor variables are on very different scales: consider rescaling
discussion
some points here
231229: transformer sentiment
# Build the list of per-batch sentiment TSV files to combine.
# (An earlier approach hard-coded the paths:
#   file_paths <- c('path/to/file1.tsv', 'path/to/file2.tsv', ...))
# library(readr)

# Directory holding the batch files.
directory <- "../csv"

# Matches batch files such as 'fb-sa-231228-001.tsv': a single leading zero
# followed by one or more digits, with a .tsv extension.
pattern <- "fb-sa-231228-0[0-9]+\\.tsv$"

file_paths <- list.files(directory, pattern = pattern, full.names = TRUE)
print(file_paths)
###############################################
# Read every batch TSV, stack them, and collapse duplicate post ids so that
# combined_df ends with exactly one row per id.

# Read a single TSV file into a data frame.
read_tsv_file <- function(fn) {
  # read_tsv(file_path)
  read.table(fn, sep = '\t', header = TRUE,
             strip.white = TRUE, stringsAsFactors = FALSE)
}

# One data frame per file, then stacked into a single frame.
list_of_dfs <- lapply(file_paths, read_tsv_file)
combined_df <- do.call(rbind, list_of_dfs)

# Split rows whose id occurs more than once from the already-unique rows.
duplicates <- combined_df %>%
  group_by(id) %>%
  filter(n() > 1)
uniques <- combined_df %>%
  group_by(id) %>%
  filter(n() == 1) %>%
  ungroup()

# Collapse the duplicated ids to one row each.
# NOTE(review): sa_scaled_sum is aggregated with mean() while sa_numeric_sum
# uses sum() — confirm this asymmetry is intentional.
aggregated_duplicates <- duplicates %>%
  group_by(id) %>%
  summarize(
    sa_numeric_mean = mean(sa_numeric_mean),
    sa_numeric_sum = sum(sa_numeric_sum),
    sa_scaled_mean = mean(sa_scaled_mean),
    sa_scaled_sum = mean(sa_scaled_sum),
    sentence_count = max(sentence_count)
  ) %>%
  ungroup()

# Re-attach the collapsed rows to the unique ones.
combined_df <- bind_rows(aggregated_duplicates, uniques)

# Final dimensions as a sanity check.
dim(combined_df)
[1] 97188 6
# View the combined dataframe#paste(dim(dtp), dim(combined_df))# Find duplicates in the specified column#dtp[duplicated(dtp$id) | duplicated(dtp$id, fromLast = TRUE), ]#combined_df[duplicated(combined_df$id) | duplicated(combined_df$id, fromLast = TRUE), ]
# Join the combined sentence-level sentiment frame onto the post data,
# keeping only ids present in both.
# Exploratory checks kept for reference:
# fn = "../csv/fb-sa-231228-001.tsv"
# tmp = read.table(fn, sep='\t', header=T, strip.white=TRUE, stringsAsFactors=FALSE)
# Rows in dtp with no match in combined_df (and vice versa):
# non_matching_df1 <- anti_join(dtp, combined_df, by = "id")
# non_matching_df2 <- anti_join(combined_df, dtp, by = "id")
# print(non_matching_df1)
# print(non_matching_df2)
dts <- inner_join(combined_df, dtp, by = "id") |> as_tibble()
dts |> names()
Linear mixed model fit by REML ['lmerMod']
Formula: e_index ~ sa_val + (1 | org_type)
Data: dtp
REML criterion at convergence: 1612004
Scaled residuals:
Min 1Q Median 3Q Max
-0.461 -0.235 -0.115 -0.030 112.746
Random effects:
Groups Name Variance Std.Dev.
org_type (Intercept) 17844 133.6
Residual 934975 966.9
Number of obs: 97188, groups: org_type, 8
Fixed effects:
Estimate Std. Error t value
(Intercept) 215.7997 47.4378 4.549
sa_val -1.7923 0.6144 -2.917
Correlation of Fixed Effects:
(Intr)
sa_val -0.034
Warning: Some predictor variables are on very different scales: consider
rescaling
#summary(m2)
Linear mixed model fit by REML ['lmerMod']
Formula: e_index ~ sa_val + follow + month + (1 | org_type)
Data: dtp
REML criterion at convergence: 1607403
Scaled residuals:
Min 1Q Median 3Q Max
-2.177 -0.182 -0.091 0.009 115.298
Random effects:
Groups Name Variance Std.Dev.
org_type (Intercept) 11092 105.3
Residual 891609 944.3
Number of obs: 97188, groups: org_type, 8
Fixed effects:
Estimate Std. Error t value
(Intercept) 2.269e+02 3.796e+01 5.978
sa_val 1.451e+00 6.019e-01 2.411
follow 3.122e-03 4.720e-05 66.132
month -3.611e+00 1.773e-01 -20.361
Correlation of Fixed Effects:
(Intr) sa_val follow
sa_val -0.043
follow -0.039 0.079
month -0.152 -0.011 -0.021
fit warnings:
Some predictor variables are on very different scales: consider rescaling
# Load the project's helper functions, then aggregate the post data per
# organization (dvs_org_aggregate is defined in socm_sa_functions.R).
source("socm_sa_functions.R")
tmp <- dvs_org_aggregate(dtp)
`summarise()` has grouped output by 'org'. You can override using the `.groups`
argument.
`summarise()` has grouped output by 'org'. You can override using the `.groups`
argument.
`summarise()` has grouped output by 'org'. You can override using the `.groups`
argument.
Loading required package: OpenMx
To take full advantage of multiple cores, use:
mxOption(key='Number of Threads', value=parallel::detectCores()) #now
Sys.setenv(OMP_NUM_THREADS=parallel::detectCores()) #before library(OpenMx)
Attaching package: 'OpenMx'
The following objects are masked from 'package:Matrix':
%&%, expm
# Monthly engagement for two example organizations with linear trends.
dtl %>%
  # Keep just the two organizations of interest.
  filter(org %in% c("ClownerutanGranser", "lakareutangranser")) %>%
  ggplot(aes(month, e_index_mean, color = org)) +
  geom_point() +                          # observed monthly engagement
  geom_smooth(method = lm, se = FALSE) +  # linear trend per organization
  theme_bw() +                            # clean theme
  labs(x = "month", y = "e_index_mean")   # axis labels
`geom_smooth()` using formula = 'y ~ x'
230315: bivariate org type
# Wide -> long on the selected columns, then faceted bar chart of the mean
# of each measure per organization type.
# NOTE(review): columns are selected by position (1, 27, 6, 11, 14, 24) in
# dtp; verify these indices against the current dtp layout.
tmp <- dtp %>%
  select(c(1, 27, 6, 11, 14, 24)) %>%
  pivot_longer(cols = 3:6, names_to = "measure", values_to = "value")

# Faceted bivariate bar plot (bar height = mean value per org type).
p <- ggplot(tmp, aes(x = org_type, y = value)) +
  geom_bar(position = 'dodge', stat = 'summary', fun = 'mean') +
  facet_wrap(vars(measure), scales = "free")
p
230314: resume analysis
# Stop knitting here; chunks below this point are not executed.
# Guard uses TRUE (not the reassignable shorthand T) so the exit can be
# toggled off by flipping the condition.
if (TRUE) {
  knitr::knit_exit()
  # exit()
  # q()
  # stop("here..")
}